The aim of this notebook is to make use of the word2vec model to find similar songs
import multiprocessing
import os
import pprint
import re
import time

import gensim.models.word2vec as w2v
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.manifold
import sklearn.preprocessing
# Route Keras to the PlaidML backend (OpenCL acceleration without CUDA).
# NOTE(review): Keras is never used in this notebook — presumably leftover; confirm before removing.
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
# To plot with plotly inside Google Colab:
def configure_plotly_browser_state():
    """Inject the require.js configuration that lets plotly render inline in Colab.

    Relies on `display` being available as a notebook builtin.
    """
    import IPython
    display(IPython.core.display.HTML('''
<script src="/static/components/requirejs/require.js"></script>
<script>
requirejs.config({
paths: {
base: '/static/base',
plotly: 'cdn.plot.ly/plotly-latest.min.js?noext',
},
});
</script>
'''))
# Call this function in every cell where you draw a plot.
Though non english artists were removed, the dataset contained Hindi lyrics of Lata Mangeshkar written in English. Therefore, I decided to remove all songs sung by her.
# Load the lyrics dataset, then drop every song by Lata Mangeshkar —
# her lyrics are Hindi transliterated into Latin script.
songs = pd.read_csv("songlyrics/songdata.csv", header=0)
songs = songs[songs.artist != 'Lata Mangeshkar']
songs.head()
To train the word2vec model, we first need to build its vocabulary. To do that, I iterated over each song and added it to an array that can later be fed to the model.
# Build the word2vec training corpus: one list of lower-cased tokens per song.
# str.split() with no argument also collapses runs of whitespace/newlines.
text_corpus = [song.lower().split() for song in songs['text']]
# ---- word2vec hyper-parameters --------------------------------------------
# Embedding size: larger vectors are costlier to train but more expressive.
num_features = 50
# Keep every word — even single occurrences — in the vocabulary.
min_word_count = 1
# One training worker per available CPU core.
num_workers = multiprocessing.cpu_count()
# Words up to 7 positions away count as context.
context_size = 7
# Threshold for downsampling very frequent words.
downsampling = 1e-1
# Fixed RNG seed for reproducibility (fully deterministic only with 1 worker).
seed = 1
# Skip-gram word2vec model.
# NOTE(review): `size=` is the gensim < 4.0 keyword (renamed `vector_size`
# in gensim 4.0) — pin the gensim version accordingly.
songs2vec = w2v.Word2Vec(
    sg=1,                      # 1 = skip-gram (vs. CBOW)
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)
# Scan the corpus once to build the vocabulary before training.
songs2vec.build_vocab(text_corpus)
print (len(text_corpus))
import time
start_time = time.time()
# Train for 2 epochs over the whole song corpus.
songs2vec.train(text_corpus, total_examples=songs2vec.corpus_count, epochs=2)
# Persist the model so later sessions can load instead of retraining.
if not os.path.exists("trained"):
    os.makedirs("trained")
songs2vec.save(os.path.join("trained", "songs2vectors.w2v"))
print("--- %s seconds ---" % (time.time() - start_time))
# Reload from disk (a no-op here; demonstrates how to resume without retraining).
songs2vec = w2v.Word2Vec.load(os.path.join("trained", "songs2vectors.w2v"))
Find similar words
# Nearest neighbours in embedding space for a few probe words.
songs2vec.wv.most_similar("love")
songs2vec.wv.most_similar("fuck")
songs2vec.wv.most_similar("coffee")
songs2vec.wv.most_similar("espresso")
Words out of context
# Odd-one-out queries: which word is furthest from the mean of the others?
songs2vec.wv.doesnt_match("happiness love joy hate".split())
songs2vec.wv.doesnt_match("breakfast milk lunch dinner".split())
songs2vec.wv.doesnt_match("morning evening night sunday".split())
songs2vec.wv.doesnt_match("high low jump".split())
# Analogy queries. Use the `.wv` accessor, consistent with the calls above —
# calling most_similar() on the model itself was deprecated in gensim 3.x
# and removed in 4.0.
songs2vec.wv.most_similar(positive=['woman', 'king'], negative=['man'])
# expected: queen
songs2vec.wv.most_similar(positive=['gin', 'whiskey'], negative=['eggs'])
Semantic distance between words
def nearest_similarity_cosmul(start1, end1, end2):
    """Solve the analogy ``start1 : end1 :: ? : end2`` with the 3CosMul measure.

    Prints the completed analogy and returns the best-matching word, so
    callers can also use the result programmatically (previously it was
    discarded). Uses the module-level ``songs2vec`` model.
    """
    similarities = songs2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{0} es a {1}, lo que {2} es a {3}".format(start1, end1, start2, end2))
    return start2

nearest_similarity_cosmul("paris", "france", "alabama")
nearest_similarity_cosmul("paris", "france", "london")
nearest_similarity_cosmul("happy", "sad", "alone")
nearest_similarity_cosmul("near", "far", "london")
With the word vector embeddings in place, it is now time to calculate the normalised vector sum of each song. This process can take some time since it has to be done for each of 57,000 songs.
# Sanity check: embedding of an arbitrary vocabulary word (via the `.wv`
# accessor — direct model indexing is deprecated/removed in newer gensim).
print(songs2vec.wv['un-right'])

def songVector(row):
    """Return the L2-normalised sum of the word vectors of a song's lyrics.

    ``row`` is the raw lyrics text; returns a (1, num_features) numpy array.
    """
    # Start from a real vector so an empty (or fully out-of-vocabulary) song
    # still works — the original started from the int 0, which has no .reshape.
    vector_sum = np.zeros(num_features, dtype=np.float32)
    for word in row.lower().split():
        # Guard against out-of-vocabulary tokens instead of raising KeyError.
        if word in songs2vec.wv:
            vector_sum = vector_sum + songs2vec.wv[word]
    vector_sum = vector_sum.reshape(1, -1)
    # normalize() leaves an all-zero row as zeros (no division-by-zero NaNs).
    return sklearn.preprocessing.normalize(vector_sum)
import time

start_time = time.time()
# Embed every song; walks all ~57k lyrics, so this cell is slow.
songs['song_vector'] = songs['text'].apply(songVector)
t-sne and random song selection
The songs have 50 dimensions each. Application of t-sne is memory intensive and hence it is slightly easier on the computer to use a random sample of the 57,000 songs.
from sklearn.model_selection import train_test_split

# t-SNE is memory-hungry, so keep only a random 10% sample of the songs.
train, test = train_test_split(songs, test_size=0.9)
song_vectors = list(train['song_vector'])
train.head(10)
I had a fairly measly 4gb machine and wasn't able to generate a more accurate model. However, one can play around with the number of iterations, learning rate and other factors to fit the model better. If you have too many dimensions (~300+), it might make sense to use PCA first and then t-sne.
# Stack the (1, 50) row vectors into an (n_samples, 50) matrix. Derive the
# row count from the data instead of hard-coding 5761, so the cell still
# works when the train/test split size changes.
X = np.array(song_vectors).reshape((len(song_vectors), num_features))
start_time = time.time()
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=250, random_state=0, verbose=2)
all_word_vectors_matrix_2d = tsne.fit_transform(X)
print("--- %s seconds ---" % (time.time() - start_time))
df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])
df.head(10)
train.head()
# Reset both indices so the frames align for a column-wise concat.
df.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
Joining two dataframes to obtain each song's corresponding X,Y co-ordinate.
# Side-by-side join: each song row gains its 2-D (X, Y) t-SNE coordinate.
two_dimensional_songs = pd.concat([train, df], axis=1)
two_dimensional_songs.head()
Plotting the results
Using plotly, I plotted the results so that it becomes easier to explore similar songs based on their colors and clusters.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Interactive scatter of the t-SNE embedding; hover text shows the song title.
trace1 = go.Scatter(
    y=two_dimensional_songs['Y'],
    x=two_dimensional_songs['X'],
    text=two_dimensional_songs['song'],
    mode='markers',
    marker=dict(
        size=5,
        # One random colour per point — derive the length from the data
        # rather than the hard-coded 5717 (the sample actually has 5761
        # rows, so the colour array length did not match the data).
        color=np.random.randn(len(two_dimensional_songs)),
        colorscale='Viridis',
        showscale=True
    )
)
data = [trace1]
iplot(data)
import plotly.express as px
# Quick plotly.express smoke test on the bundled iris dataset.
# NOTE(review): this rebinds `df` (previously the t-SNE coordinates);
# harmless here only because `df` is rebuilt before its next use.
df = px.data.iris()
print(df)
print(type(df))
#fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
#                    color='species')
#fig.show()
### Plot clusters coloured by ARTIST
print(type(two_dimensional_songs))
import plotly.express as px
fig = px.scatter(two_dimensional_songs, x='X', y='Y', color='artist')
fig.show()
import plotly.graph_objects as go
import numpy as np

# Same scatter as above, but hover text combines artist and song title.
fig = go.Figure(data=go.Scatter(
    y=two_dimensional_songs['Y'],
    x=two_dimensional_songs['X'],
    text=two_dimensional_songs['artist'] + "_" + two_dimensional_songs['song'],
    mode='markers',
    marker=dict(
        size=10,
        # One random colour per row (the original hard-coded 5717, which
        # did not match the number of plotted points).
        color=np.random.randn(len(two_dimensional_songs)),
        colorscale='Viridis',
        showscale=True
    )
))
fig.show()
## LOOK FOR COMMON SONGS AND ANALYZE THE TEXT
import plotly.graph_objects as go
import numpy as np

# Stand-alone demo: 500 random y-values, coloured by a second random draw.
fig = go.Figure(data=go.Scatter(
    y=np.random.randn(500),
    mode='markers',
    marker=dict(size=16,
                color=np.random.randn(500),
                colorscale='Viridis',  # one of plotly's colorscales
                showscale=True)
))
fig.show()
import plotly.express as px
# 3-D scatter of the iris demo dataset, coloured by species.
df = px.data.iris()
fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
                    color='species')
fig.show()
# ---- t-SNE, second attempt: more iterations, high learning rate (1000) ----
# Derive matrix dimensions from the data instead of hard-coding 5761.
X = np.array(song_vectors).reshape((len(song_vectors), num_features))
start_time = time.time()
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=2000, random_state=0,
                             verbose=2, learning_rate=1000)
all_word_vectors_matrix_2d = tsne.fit_transform(X)
print("--- %s seconds ---" % (time.time() - start_time))
df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])
df.head(10)
train.head()
# Align indices, then join coordinates onto the song metadata.
df.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
two_dimensional_songs = pd.concat([train, df], axis=1)
two_dimensional_songs.head()

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Scatter with hover text = song title.
trace1 = go.Scatter(
    y=two_dimensional_songs['Y'],
    x=two_dimensional_songs['X'],
    text=two_dimensional_songs['song'],
    mode='markers',
    marker=dict(
        size=5,
        # colour-array length derived from the data (was hard-coded 5717)
        color=np.random.randn(len(two_dimensional_songs)),
        colorscale='Viridis',
        showscale=True
    )
)
data = [trace1]
iplot(data)

### Plot clusters coloured by ARTIST
print(type(two_dimensional_songs))
import plotly.express as px
fig = px.scatter(two_dimensional_songs, x='X', y='Y', color='artist')
fig.show()
The learning rate is too high; we try lowering it.
# ---- t-SNE, third attempt: same iterations, lower learning rate (500) -----
# Matrix dimensions derived from the data instead of hard-coded 5761.
X = np.array(song_vectors).reshape((len(song_vectors), num_features))
start_time = time.time()
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=2000, random_state=0,
                             verbose=2, learning_rate=500)
all_word_vectors_matrix_2d = tsne.fit_transform(X)
print("--- %s seconds ---" % (time.time() - start_time))
df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])
df.head(10)
train.head()
df.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
two_dimensional_songs = pd.concat([train, df], axis=1)
two_dimensional_songs.head()

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Scatter(
    y=two_dimensional_songs['Y'],
    x=two_dimensional_songs['X'],
    text=two_dimensional_songs['song'],
    mode='markers',
    marker=dict(
        size=5,
        # colour-array length derived from the data (was hard-coded 5717)
        color=np.random.randn(len(two_dimensional_songs)),
        colorscale='Viridis',
        showscale=True
    )
)
data = [trace1]
iplot(data)

### Plot clusters coloured by ARTIST
print(type(two_dimensional_songs))
import plotly.express as px
fig = px.scatter(two_dimensional_songs, x='X', y='Y', color='artist')
fig.show()
We increase the number of iterations.
# ---- t-SNE, fourth attempt: many more iterations (8000), lr = 400 ---------
# Matrix dimensions derived from the data instead of hard-coded 5761.
X = np.array(song_vectors).reshape((len(song_vectors), num_features))
start_time = time.time()
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=8000, random_state=0,
                             verbose=2, learning_rate=400)
all_word_vectors_matrix_2d = tsne.fit_transform(X)
print("--- %s seconds ---" % (time.time() - start_time))
df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])
df.head(10)
train.head()
df.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
two_dimensional_songs = pd.concat([train, df], axis=1)
two_dimensional_songs.head()

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

trace1 = go.Scatter(
    y=two_dimensional_songs['Y'],
    x=two_dimensional_songs['X'],
    text=two_dimensional_songs['song'],
    mode='markers',
    marker=dict(
        size=5,
        # colour-array length derived from the data (was hard-coded 5717)
        color=np.random.randn(len(two_dimensional_songs)),
        colorscale='Viridis',
        showscale=True
    )
)
data = [trace1]
iplot(data)

### Plot clusters coloured by ARTIST
print(type(two_dimensional_songs))
import plotly.express as px
fig = px.scatter(two_dimensional_songs, x='X', y='Y', color='artist')
fig.show()
# Second dataset: IMDB movie reviews with positive/negative sentiment labels.
reviews = pd.read_csv("IMDB Dataset.csv", header=0)
reviews.head()
# One list of lower-cased tokens per review — same preprocessing as the songs.
text_corpus_IMBD = [review.lower().split() for review in reviews['review']]
# ---- word2vec hyper-parameters for the review model (same as for songs) ---
# Embedding size: larger vectors cost more to train but carry more signal.
num_features = 50
# Keep every word in the vocabulary, including single occurrences.
min_word_count = 1
# One training worker per available CPU core.
num_workers = multiprocessing.cpu_count()
# Context window: words up to 7 positions away.
context_size = 7
# Threshold for downsampling very frequent words.
downsampling = 1e-1
# Fixed RNG seed for reproducibility (fully deterministic only with 1 worker).
seed = 1
# Skip-gram model for the review corpus (same configuration as songs2vec).
# NOTE(review): `size=` is the gensim < 4.0 keyword (renamed `vector_size`
# in gensim 4.0) — pin the gensim version accordingly.
reviews2vec = w2v.Word2Vec(
    sg=1,                      # 1 = skip-gram (vs. CBOW)
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)
# Build the vocabulary from the review corpus before training.
reviews2vec.build_vocab(text_corpus_IMBD)
print (len(text_corpus_IMBD))
import time
start_time = time.time()
# Train for 2 epochs, then save/reload so the model can be reused later.
reviews2vec.train(text_corpus_IMBD, total_examples=reviews2vec.corpus_count, epochs=2)
if not os.path.exists("trained"):
    os.makedirs("trained")
reviews2vec.save(os.path.join("trained", "reviews2vectors.w2v"))
print("--- %s seconds ---" % (time.time() - start_time))
reviews2vec = w2v.Word2Vec.load(os.path.join("trained", "reviews2vectors.w2v"))
# Nearest neighbours for a few review-domain probe words.
reviews2vec.wv.most_similar("amazing")
reviews2vec.wv.most_similar("thrilling")
reviews2vec.wv.most_similar("strange")
def nearest_similarity_cosmul(start1, end1, end2):
    """Solve the analogy ``start1 : end1 :: ? : end2`` on the review model.

    Redefines the earlier song-model helper to use ``reviews2vec``. Prints
    the completed analogy and returns the best-matching word (previously
    the result was discarded).
    """
    similarities = reviews2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{0} es a {1}, lo que {2} es a {3}".format(start1, end1, start2, end2))
    return start2

nearest_similarity_cosmul("funny", "comedy", "horror")
nearest_similarity_cosmul("thrilling", "thriller", "suspense")
We try further example functions, such as the closeness check: `closer_than` lists the words in the corpus that are closer to the first word than the second word is.
# Words in the vocabulary that are closer to the first word than the second is.
reviews2vec.wv.closer_than("bad", "good")
reviews2vec.wv.closer_than("interesting", "thrilling")
The next function, `rank`, shows the similarity rank of the second word among the first word's nearest neighbours.
# Rank of "sad" among the words most similar to "funny" (1 = most similar).
reviews2vec.wv.rank("funny", "sad")
Compute the normalised sum vector of each review.
# Sanity check: embedding of a vocabulary word (via `.wv` — direct model
# indexing is deprecated/removed in newer gensim).
print(reviews2vec.wv['thrilling'])

def reviewVector(row):
    """Return the L2-normalised sum of the word vectors of a review.

    ``row`` is the raw review text; returns a (1, num_features) numpy array.
    """
    # Start from a real vector so an empty review still works — the original
    # started from the int 0, which has no .reshape.
    vector_sum = np.zeros(num_features, dtype=np.float32)
    for word in row.lower().split():
        # Skip out-of-vocabulary tokens instead of raising KeyError.
        if word in reviews2vec.wv:
            vector_sum = vector_sum + reviews2vec.wv[word]
    vector_sum = vector_sum.reshape(1, -1)
    return sklearn.preprocessing.normalize(vector_sum)
import time

start_time = time.time()
# Embed every review; the slowest cell in the review half of the notebook.
reviews['review_vector'] = reviews['review'].apply(reviewVector)
t-SNE and random review selection
Each review vector has 50 dimensions. t-SNE is memory-intensive, so it is easier on the machine to work with a random sample of the 50,000 reviews.
from sklearn.model_selection import train_test_split

# Keep a random 10% of the reviews for t-SNE.
train, test = train_test_split(reviews, test_size=0.9)
review_vectors = list(train['review_vector'])
train.head(10)
I started with the same parameter values as in the previous example and tuned them to obtain better results. Some example outcomes are shown below.
# Project the review vectors to 2-D. Derive the row count from the data
# instead of hard-coding 5000, so a different sample size still works.
X = np.array(review_vectors).reshape((len(review_vectors), num_features))
start_time = time.time()
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=10000, random_state=0,
                             verbose=2, learning_rate=700)
all_word_vectors_matrix_2d = tsne.fit_transform(X)
print("--- %s seconds ---" % (time.time() - start_time))
df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])
df.head(10)
train.head()
# Align indices, then join coordinates onto the review metadata.
df.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
two_dimensional_reviews = pd.concat([train, df], axis=1)
two_dimensional_reviews.head()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Scatter of the review embedding; hover text is the full review.
trace1 = go.Scatter(
    y=two_dimensional_reviews['Y'],
    x=two_dimensional_reviews['X'],
    text=two_dimensional_reviews['review'],
    mode='markers',
    marker=dict(
        size=5,
        # One colour per row. The original hard-coded 5717, but the review
        # sample has 5000 rows — a colour/data length mismatch.
        color=np.random.randn(len(two_dimensional_reviews)),
        colorscale='Viridis',
        showscale=True
    )
)
data = [trace1]
iplot(data)
Next, the reviews are shown with the colour indicating whether their sentiment is positive or negative.
print(type(two_dimensional_reviews))
import plotly.express as px
# Colour each review point by its positive/negative sentiment label.
fig = px.scatter(two_dimensional_reviews, x='X', y='Y', color='sentiment')
fig.show()
Changing the perplexity.
# Same projection, now with perplexity = 40. Row count derived from the
# data instead of hard-coded 5000.
X = np.array(review_vectors).reshape((len(review_vectors), num_features))
start_time = time.time()
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=10000, random_state=0,
                             verbose=2, learning_rate=700, perplexity=40)
all_word_vectors_matrix_2d = tsne.fit_transform(X)
print("--- %s seconds ---" % (time.time() - start_time))
df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])
df.head(10)
train.head()
df.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
two_dimensional_reviews = pd.concat([train, df], axis=1)
two_dimensional_reviews.head()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Scatter for the perplexity-40 run; hover text is the full review.
trace1 = go.Scatter(
    y=two_dimensional_reviews['Y'],
    x=two_dimensional_reviews['X'],
    text=two_dimensional_reviews['review'],
    mode='markers',
    marker=dict(
        size=5,
        # colour-array length derived from the data (was hard-coded 5717
        # against a 5000-row sample — a length mismatch)
        color=np.random.randn(len(two_dimensional_reviews)),
        colorscale='Viridis',
        showscale=True
    )
)
data = [trace1]
iplot(data)
print(type(two_dimensional_reviews))
import plotly.express as px
# Colour each review point by its positive/negative sentiment label.
fig = px.scatter(two_dimensional_reviews, x='X', y='Y', color='sentiment')
fig.show()
# Final run: default learning rate, fewer iterations, perplexity = 40.
# Row count derived from the data instead of hard-coded 5000.
X = np.array(review_vectors).reshape((len(review_vectors), num_features))
start_time = time.time()
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=5000, random_state=0,
                             verbose=2, perplexity=40)
all_word_vectors_matrix_2d = tsne.fit_transform(X)
print("--- %s seconds ---" % (time.time() - start_time))
df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])
df.head(10)
train.head()
df.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
two_dimensional_reviews = pd.concat([train, df], axis=1)
two_dimensional_reviews.head()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# Scatter with colour mapped to the X coordinate itself (a gradient, not a label).
trace1 = go.Scatter(
    y = two_dimensional_reviews['Y'],
    x = two_dimensional_reviews['X'],
    text = two_dimensional_reviews['review'],
    mode='markers',
    marker=dict(
        size= 5,
        color = two_dimensional_reviews['X'],  # colour follows the X coordinate
        colorscale='Viridis',
        showscale=True
    )
)
data = [trace1]
iplot(data)
print(type(two_dimensional_reviews))
import plotly.express as px
# Same embedding, coloured by the sentiment label instead.
fig = px.scatter(two_dimensional_reviews, x='X', y='Y',color='sentiment')
fig.show()
# Encode the sentiment label as a categorical integer code so it can drive
# a numeric colour scale in the graph_objs scatter below.
two_dimensional_reviews["sentiment"] = two_dimensional_reviews["sentiment"].astype('category')
two_dimensional_reviews["sentiment_cat"] = two_dimensional_reviews["sentiment"].cat.codes
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# Scatter coloured by the sentiment code; hover text shows the review body.
trace1 = go.Scatter(
    y = two_dimensional_reviews['Y'],
    x = two_dimensional_reviews['X'],
    text = two_dimensional_reviews['review'],
    mode='markers',
    marker=dict(
        size= 5,
        # category codes are assigned alphabetically — presumably
        # 0 = negative, 1 = positive; NOTE(review): verify against the data
        color = two_dimensional_reviews['sentiment_cat'],
        colorscale='Viridis',
        showscale=True
    )
)
data = [trace1]
iplot(data)